import re
import jieba
from collections import Counter
import pymysql

# Read the news-title file (one title per line).
with open('通过接口获取数据\\news_titles.txt', 'r', encoding='utf-8') as f:
    news_titles = f.readlines()

# Join all titles into one string so cleaning/segmentation happens in one pass.
news_str = ' '.join(news_titles)

# Strip punctuation and digits; keep word characters and whitespace.
news_str = re.sub(r'[^\w\s]|\d', '', news_str)

# Lowercase so Latin-script tokens are counted case-insensitively.
news_str = news_str.lower()

# Load the stop-word list. Use a set for O(1) membership tests (the original
# used a list, O(n) per token) and a `with` block so the file handle is
# closed (the original left it open).
with open('通过接口获取数据\\stopwords.txt', 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}

# Segment with jieba, dropping stop words.
words = [word for word in jieba.lcut(news_str) if word not in stopwords]

# Count token frequencies; keep the 30 most frequent as the "hot words".
word_counts = Counter(words)
hot_words = word_counts.most_common(30)  # list of (word, count) pairs

# Write the hot words into the MySQL table `word`.
# NOTE(review): credentials are hard-coded — move to config/env for production.
conn = pymysql.connect(host="localhost", user="root", password="123456", db="bs", charset="utf8")
try:
    # Cursor as a context manager so it is closed even if an insert fails;
    # the original leaked both cursor and connection on error.
    with conn.cursor() as cursor:
        # One parameterized batch insert instead of rebuilding the same SQL
        # string and calling execute() once per row.
        cursor.executemany(
            "INSERT INTO word (word, count) VALUES (%s, %s)",
            hot_words,
        )
    conn.commit()
finally:
    conn.close()

print('处理完毕!')